# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from lpw.items import LpwItem


class LpwSpiderSpider(CrawlSpider):
    """Crawl python-keyword job listings on liepin.com and yield one
    :class:`LpwItem` per job-detail page."""

    name = 'lpw_spider'
    allowed_domains = ['liepin.com']
    start_urls = ['https://www.liepin.com/zhaopin/?key=python']

    rules = (
        # Job detail pages: parse with parse_job, do not follow links from them.
        # NOTE: dots in the host/extension are escaped — the original pattern
        # used bare '.', which matches any character in a regex.
        Rule(LinkExtractor(allow=r'https://www\.liepin\.com/job/\d+\.shtml',
                           restrict_xpaths=['//ul[@class="sojob-list"]//a']),
             callback='parse_job', follow=False),
        # Pagination links: follow to reach subsequent result pages.
        Rule(LinkExtractor(allow=r'/zhaopin/.+curPage=\d+',
                           restrict_xpaths=['//div[@class="pagerbar"]']),
             follow=True),
    )

    def parse_job(self, response):
        """Extract the job fields from a detail page and yield an LpwItem.

        :param response: scrapy Response for a job-detail page
        """
        title = response.css('.title-info h1::text').get()
        company = response.css('.title-info h3 a::text').get()
        # The city can be split across several <a> tags; join then strip.
        city_parts = response.css('.basic-infor span a::text').getall()
        city = ''.join(city_parts).strip()
        edu = response.css('.job-qualifications span:nth-child(1)::text').get()
        work = response.css('.job-qualifications span:nth-child(2)::text').get()
        # The job description is spread over multiple text nodes.
        content_parts = response.css('.content-word::text').getall()
        content = ''.join(content_parts).strip()
        yield LpwItem(title=title, company=company, city=city, edu=edu,
                      work=work, content=content)
